{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "be10f3a6-d2bc-45b4-98f6-f42c41cbc076",
   "metadata": {},
   "source": [
    "# Step 2. Training"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "502cf77c-b94f-40df-9fa3-7dcc3dd88334",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b590380a-f3af-4174-b857-e39e87c607e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install xgboost --quiet\n",
    "!pip install seaborn --quiet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45431dc3-b78c-4a17-98f1-de00d297aabe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "\n",
    "import xgboost as xgb\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import f1_score\n",
    "import time\n",
    "\n",
    "# Mute warnings\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35eeb3c8-4e69-48a7-bf4e-138d9eac19c4",
   "metadata": {},
   "source": [
    "## Connecting to Hopsworks Feature Store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20caeb77-5147-4ac7-9420-cf6618a0f3ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import hopsworks\n",
    "\n",
    "from ipython_secrets import *\n",
    "KEY = get_secret('HOPSWORKS_API_KEY')\n",
    "project = hopsworks.login(host=\"c.app.hopsworks.ai\", api_key_value=KEY)\n",
    "\n",
    "fs = project.get_feature_store()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de186bea-150f-4e61-861a-979b6c41d28e",
   "metadata": {},
   "source": [
    "### Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3860294-db23-4b4f-9a7b-ee89cba0c80f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrieve feature groups\n",
    "trans_fg = fs.get_feature_group(\n",
    "    name='transactions_fraud_streaming_fg_' + str(project.id), \n",
    "    version=1,\n",
    ")\n",
    "window_aggs_fg = fs.get_feature_group(\n",
    "    name='transactions_aggs_fraud_streaming_fg_' + str(project.id), \n",
    "    version=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f5b4ed9-900d-4018-a039-3e88f27c3e79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select features for training data.\n",
    "selected_features = trans_fg.select([\"fraud_label\", \"category\", \"amount\", \"date_time\", \"age_at_transaction\", \"days_until_card_expires\"])\\\n",
    "    .join(window_aggs_fg.select_except([\"cc_num\", \"date_time\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa2831c8-04fc-4fdf-a803-f35590533b9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment this if you would like to view your selected features\n",
    "selected_features.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6036df5-16bc-462a-8dd0-ec4a1b66ef97",
   "metadata": {},
   "source": [
    "### Transformation Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc1777f5-8efe-46d6-98df-49fc11af8d88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load transformation functions.\n",
    "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n",
    "\n",
    "# Map features to transformations.\n",
    "transformation_functions = {\n",
    "    \"category\": label_encoder,\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d87abb82-9b94-4299-9b01-7912b91c0858",
   "metadata": {},
   "source": [
    "## Feature View Creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e67780a-b922-47b6-bad2-ad4553f319ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get or create the 'transactions_view_fraud_batch_fv' feature view\n",
    "feature_view = fs.get_or_create_feature_view(\n",
    "    name='transactions_view_streaming_fv',\n",
    "    version=1,\n",
    "    query=selected_features,\n",
    "    labels=[\"fraud_label\"],\n",
    "    transformation_functions=transformation_functions,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f528cbf-f86f-44ff-905d-49a18468782f",
   "metadata": {},
   "source": [
    "## Training Dataset Creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbd505c7-23ba-4c16-923e-e4a1b01aea25",
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_SIZE = 0.2\n",
    "\n",
    "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n",
    "    test_size = TEST_SIZE\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "265ce2da-10ec-44d8-8f11-d607656bf07c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the X_train DataFrame based on the \"date_time\" column in ascending order\n",
    "X_train = X_train.sort_values(\"date_time\")\n",
    "\n",
    "# Reindex the y_train Series to match the order of rows in the sorted X_train DataFrame\n",
    "y_train = y_train.reindex(X_train.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd455ade-ede6-4c05-99fd-7cf6fdce9488",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the X_test DataFrame based on the \"date_time\" column in ascending order\n",
    "X_test = X_test.sort_values(\"date_time\")\n",
    "\n",
    "# Reindex the y_test Series to match the order of rows in the sorted X_test DataFrame\n",
    "y_test = y_test.reindex(X_test.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b89e79-03b8-45aa-8c16-6ba434f22519",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the \"date_time\" column from the X_train DataFrame along the specified axis (axis=1 means columns)\n",
    "X_train.drop([\"date_time\"], axis=1, inplace=True)\n",
    "\n",
    "# Drop the \"date_time\" column from the X_test DataFrame along the specified axis (axis=1 means columns)\n",
    "X_test.drop([\"date_time\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fc90917-97d9-4169-ab30-e2fe10e9c282",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7a9ce25-c7e0-40a2-b45b-a6c6151d5ad5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the normalized value counts of the y_train Series\n",
    "y_train.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87b52688-66f4-4fbc-85f9-95ccd344f343",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f18c3e61-64e0-4ad6-a39e-f742d95ea08b",
   "metadata": {},
   "source": [
    "## Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8c56fbd-d895-4ccf-906c-04340ae500af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an instance of the XGBClassifier\n",
    "clf = xgb.XGBClassifier()\n",
    "\n",
    "# Fit the classifier on the training data\n",
    "clf.fit(X_train.values, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a5ef656-a2aa-4caf-a015-15bb9304dad9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict the training data using the trained classifier\n",
    "y_pred_train = clf.predict(X_train.values)\n",
    "\n",
    "# Predict the test data using the trained classifier\n",
    "y_pred_test = clf.predict(X_test.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ed30b3b-c010-41d0-b847-259a0e8df31d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute f1 score\n",
    "metrics = {\n",
    "    \"f1_score\": f1_score(y_test, y_pred_test, average='macro')\n",
    "}\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba1059f9-8a29-4903-99ec-661b1f4e5fba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the confusion matrix using the true labels (y_test) and predicted labels (y_pred_test)\n",
    "results = confusion_matrix(y_test, y_pred_test)\n",
    "\n",
    "# Print the confusion matrix\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "260b78f6-dfa3-4ba6-80e6-43b7584fdc7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a DataFrame from the confusion matrix results with appropriate labels\n",
    "df_cm = pd.DataFrame(\n",
    "    results, \n",
    "    ['True Normal', 'True Fraud'],\n",
    "    ['Pred Normal', 'Pred Fraud'],\n",
    ")\n",
    "\n",
    "# Create a heatmap using seaborn with annotations\n",
    "cm = sns.heatmap(df_cm, annot=True)\n",
    "\n",
    "# Get the figure from the heatmap and display it\n",
    "fig = cm.get_figure()\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6758623-6bb5-4137-b43b-aa6db6348d4a",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6379c024-7f3a-42d5-9139-769338e578f0",
   "metadata": {},
   "source": [
    "## Model Schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fed2051c-c8f8-4db5-906a-31ed02ce8a8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from hsml.schema import Schema\n",
    "from hsml.model_schema import ModelSchema\n",
    "\n",
    "# Define the input schema using the values of X_train\n",
    "input_schema = Schema(X_train.values)\n",
    "\n",
    "# Define the output schema using y_train\n",
    "output_schema = Schema(y_train)\n",
    "\n",
    "# Create a ModelSchema object specifying the input and output schemas\n",
    "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)\n",
    "\n",
    "# Convert the model schema to a dictionary for further inspection or serialization\n",
    "model_schema.to_dict()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71e5aa77-56b4-416f-922f-253c31ab8498",
   "metadata": {},
   "source": [
    "## Register Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebba47a3-d61f-4bd3-a936-860665d9d0a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify the directory where the model will be saved\n",
    "model_dir = \"fraud_streaming_model\"\n",
    "\n",
    "# Check if the directory exists, and create it if it doesn't\n",
    "if not os.path.isdir(model_dir):\n",
    "    os.mkdir(model_dir)\n",
    "\n",
    "# Save the trained XGBoost model using joblib\n",
    "joblib.dump(clf, model_dir + '/xgboost_fraud_streaming_model.pkl')\n",
    "\n",
    "# Save the confusion matrix heatmap as an image in the model directory\n",
    "fig.savefig(model_dir + \"/confusion_matrix.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc5ffac2-3552-475e-a1e6-15ef39ca3613",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the model registry\n",
    "mr = project.get_model_registry()\n",
    "\n",
    "# Create a new model in the model registry\n",
    "fraud_model = mr.python.create_model(\n",
    "    name=\"xgboost_fraud_streaming_model\",     # Name for the model\n",
    "    metrics=metrics,                      # Metrics used for evaluation\n",
    "    model_schema=model_schema,            # Schema defining the model's input and output\n",
    "    input_example=X_train.sample(),       # Example input data for reference\n",
    "    description=\"Fraud Batch Predictor\",  # Description of the model\n",
    ")\n",
    "\n",
    "# Save the model to the specified directory\n",
    "fraud_model.save(model_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efba81dc-7c8b-4fd1-9417-3d9b7b423d5e",
   "metadata": {},
   "source": [
    "## Model Deployment"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19d3e6df-5554-440e-b762-61d0671e40d0",
   "metadata": {},
   "source": [
    "### Predictor script for Python Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3b6e9c5-136c-4d06-9042-96cb5e636ac0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile predict_example.py\n",
    "import os\n",
    "import numpy as np\n",
    "import hsfs\n",
    "import joblib\n",
    "\n",
    "\n",
    "class Predict(object):\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\" Initializes the serving state, reads a trained model\"\"\"        \n",
    "        # Get feature store handle\n",
    "        fs_conn = hsfs.connection()\n",
    "        self.fs = fs_conn.get_feature_store()\n",
    "        \n",
    "        # Get feature view\n",
    "        self.fv = self.fs.get_feature_view(\"transactions_view_streaming_fv\", 1)\n",
    "        \n",
    "        # Initialize serving\n",
    "        self.fv.init_serving(1)\n",
    "\n",
    "        # Load the trained model\n",
    "        self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/xgboost_fraud_streaming_model.pkl\")\n",
    "        print(\"Initialization Complete\")\n",
    "\n",
    "    def predict(self, inputs):\n",
    "        \"\"\" Serves a prediction request usign a trained model\"\"\"\n",
    "        feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]}, return_type=\"pandas\").drop([\"date_time\"], axis=1).values\n",
    "        return self.model.predict(feature_vector.reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f89da0a8-59b5-4e6d-bfc1-b438a7ef3770",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the dataset API for the current project\n",
    "dataset_api = project.get_dataset_api()\n",
    "\n",
    "# Specify the local file path of the Python script to be uploaded\n",
    "local_script_path = \"predict_example.py\"\n",
    "\n",
    "# Upload the Python script to the \"Models\", and overwrite if it already exists\n",
    "uploaded_file_path = dataset_api.upload(local_script_path, \"Models\", overwrite=True)\n",
    "\n",
    "# Create the full path to the uploaded script for future reference\n",
    "predictor_script_path = os.path.join(\"/Projects\", project.name, uploaded_file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c260759d-f461-4aa3-996c-4e703b5d8992",
   "metadata": {},
   "source": [
    "### Create the deployment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f395310f-7e52-42ba-9931-53d42099f5ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deploy the fraud model\n",
    "deployment = fraud_model.deploy(\n",
    "    name=\"fraudonlinemodeldeployment\",  # Specify a name for the deployment\n",
    "    script_file=predictor_script_path,  # Provide the path to the Python script for prediction\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4be497f-b68e-4975-9dc0-18ca8236912e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the name of the deployment\n",
    "print(\"Deployment: \" + deployment.name)\n",
    "\n",
    "# Display information about the deployment\n",
    "deployment.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d882d79c-d068-4e1b-8470-5d98371ac831",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Deployment is warming up...\")\n",
    "time.sleep(45)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f1f0438-f12c-492c-af9a-13f134531544",
   "metadata": {},
   "source": [
    "#### The deployment has now been registered. However, to start it you need to run the following command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e2c13c7-9b18-4a5c-b57c-ca4000f6fc63",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the deployment and wait for it to be in a running state for up to 300 seconds\n",
    "deployment.start(await_running=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98896769-5513-4aee-b156-644d1b6346f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the current state of the deployment\n",
    "deployment.get_state().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b03a8dda-013d-4f00-89ef-eba17fa1d8dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# To troubleshoot you can use `get_logs()` method\n",
    "deployment.get_logs(component='predictor')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74d76f8e-c443-4b1b-a9be-e72463240640",
   "metadata": {},
   "source": [
    "### Stop Deployment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b445eda9-1312-4a17-bd0d-7499d8402af8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop the deployment and wait for it to be in a stopped state for up to 180 seconds\n",
    "deployment.stop(await_stopped=180)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f842544-0507-4bc7-898b-7d55c533932c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
